InĀ [75]:
# Import pandas, the library used for the data analysis in this notebook
import pandas as pd
InĀ [77]:
# Read the yellow-taxi trip records (January 2015) from the local CSV file into a DataFrame.
data = pd.read_csv(
    filepath_or_buffer="C:\\Users\\jafer\\Desktop\\yellow_tripdata_2015-01.csv",
)
InĀ [79]:
# Display the first 5 rows of the dataset (head() returns 5 rows when called without an argument).
data.head()
Out[79]:
| VendorID | tpep_pickup_datetime | tpep_dropoff_datetime | passenger_count | trip_distance | pickup_longitude | pickup_latitude | RateCodeID | dropoff_longitude | dropoff_latitude | payment_type | fare_amount | extra | mta_tax | tip_amount | tolls_amount | improvement_surcharge | total_amount | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2 | 15/01/2015 19:05 | 15/01/2015 19:23 | 1 | 1.59 | -73.993896 | 40.750111 | 1 | -73.974785 | 40.750618 | 1 | 12.0 | 1.0 | 0.5 | 3.25 | 0.0 | 0.3 | 17.05 |
| 1 | 1 | 10/01/2015 20:33 | 10/01/2015 20:53 | 1 | 3.30 | -74.001648 | 40.724243 | 1 | -73.994415 | 40.759110 | 1 | 14.5 | 0.5 | 0.5 | 2.00 | 0.0 | 0.3 | 17.80 |
| 2 | 1 | 10/01/2015 20:33 | 10/01/2015 20:43 | 1 | 1.80 | -73.963341 | 40.802788 | 1 | -73.951820 | 40.824413 | 2 | 9.5 | 0.5 | 0.5 | 0.00 | 0.0 | 0.3 | 10.80 |
| 3 | 1 | 10/01/2015 20:33 | 10/01/2015 20:35 | 1 | 0.50 | -74.009087 | 40.713818 | 1 | -74.004326 | 40.719986 | 2 | 3.5 | 0.5 | 0.5 | 0.00 | 0.0 | 0.3 | 4.80 |
| 4 | 1 | 10/01/2015 20:33 | 10/01/2015 20:52 | 1 | 3.00 | -73.971176 | 40.762428 | 1 | -74.004181 | 40.742653 | 2 | 15.0 | 0.5 | 0.5 | 0.00 | 0.0 | 0.3 | 16.30 |
InĀ [81]:
# Shape of the loaded dataset as a (number of rows, number of columns) tuple.
# The recorded output for this file is (572712, 18), i.e. 572712 rows and 18 columns.
# NOTE(review): this comment previously stated 12748986 rows and 19 columns, which does not
# match the output - presumably those figures describe the full, unsampled file; confirm.
data.shape
Out[81]:
(572712, 18)
InĀ [83]:
# Print a summary of the DataFrame: index range, each column's non-null count and dtype,
# and the approximate memory usage.
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 572712 entries, 0 to 572711 Data columns (total 18 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 VendorID 572712 non-null int64 1 tpep_pickup_datetime 572712 non-null object 2 tpep_dropoff_datetime 572712 non-null object 3 passenger_count 572712 non-null int64 4 trip_distance 572712 non-null float64 5 pickup_longitude 572712 non-null float64 6 pickup_latitude 572712 non-null float64 7 RateCodeID 572712 non-null int64 8 dropoff_longitude 572712 non-null float64 9 dropoff_latitude 572712 non-null float64 10 payment_type 572712 non-null int64 11 fare_amount 572712 non-null float64 12 extra 572712 non-null float64 13 mta_tax 572712 non-null float64 14 tip_amount 572712 non-null float64 15 tolls_amount 572712 non-null float64 16 improvement_surcharge 572712 non-null float64 17 total_amount 572712 non-null float64 dtypes: float64(12), int64(4), object(2) memory usage: 78.7+ MB
InĀ [85]:
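# Optional one-time setup, kept commented out. Note: the pandas-profiling package has been
# renamed to ydata-profiling, so a current install would be `%pip install ydata-profiling`.
#! pip install https://github.com/pandas-profiling/pandas-profiling/archive/master.zip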
InĀ [87]:
# Third-party imports used by the notebook, gathered in a single cell.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import ydata_profiling
# pandas-profiling was renamed to ydata-profiling. ProfileReport is called by name further
# down, so it has to be imported explicitly from the new package; without this line the
# ProfileReport(...) call fails with NameError on a fresh kernel.
from ydata_profiling import ProfileReport
InĀ [89]:
# Load the trip records again; this is the same CSV path that the earlier read_csv cell uses.
data = pd.read_csv(
    filepath_or_buffer="C:\\Users\\jafer\\Desktop\\yellow_tripdata_2015-01.csv",
)
InĀ [91]:
# Display the DataFrame; the rendered output is truncated to the first and last rows,
# followed by the overall row and column counts.
data
Out[91]:
| VendorID | tpep_pickup_datetime | tpep_dropoff_datetime | passenger_count | trip_distance | pickup_longitude | pickup_latitude | RateCodeID | dropoff_longitude | dropoff_latitude | payment_type | fare_amount | extra | mta_tax | tip_amount | tolls_amount | improvement_surcharge | total_amount | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2 | 15/01/2015 19:05 | 15/01/2015 19:23 | 1 | 1.59 | -73.993896 | 40.750111 | 1 | -73.974785 | 40.750618 | 1 | 12.0 | 1.0 | 0.5 | 3.25 | 0.0 | 0.3 | 17.05 |
| 1 | 1 | 10/01/2015 20:33 | 10/01/2015 20:53 | 1 | 3.30 | -74.001648 | 40.724243 | 1 | -73.994415 | 40.759110 | 1 | 14.5 | 0.5 | 0.5 | 2.00 | 0.0 | 0.3 | 17.80 |
| 2 | 1 | 10/01/2015 20:33 | 10/01/2015 20:43 | 1 | 1.80 | -73.963341 | 40.802788 | 1 | -73.951820 | 40.824413 | 2 | 9.5 | 0.5 | 0.5 | 0.00 | 0.0 | 0.3 | 10.80 |
| 3 | 1 | 10/01/2015 20:33 | 10/01/2015 20:35 | 1 | 0.50 | -74.009087 | 40.713818 | 1 | -74.004326 | 40.719986 | 2 | 3.5 | 0.5 | 0.5 | 0.00 | 0.0 | 0.3 | 4.80 |
| 4 | 1 | 10/01/2015 20:33 | 10/01/2015 20:52 | 1 | 3.00 | -73.971176 | 40.762428 | 1 | -74.004181 | 40.742653 | 2 | 15.0 | 0.5 | 0.5 | 0.00 | 0.0 | 0.3 | 16.30 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 572707 | 1 | 29/01/2015 19:55 | 29/01/2015 20:17 | 1 | 4.90 | -74.001946 | 40.715950 | 1 | -73.953331 | 40.771603 | 1 | 18.0 | 1.0 | 0.5 | 4.95 | 0.0 | 0.3 | 24.75 |
| 572708 | 1 | 29/01/2015 19:55 | 29/01/2015 20:01 | 1 | 1.20 | -73.983826 | 40.749725 | 1 | -73.995209 | 40.739670 | 2 | 6.0 | 0.5 | 0.5 | 0.00 | 0.0 | 0.3 | 7.30 |
| 572709 | 1 | 29/01/2015 19:55 | 29/01/2015 19:59 | 1 | 0.80 | -73.966621 | 40.764755 | 1 | -73.963760 | 40.773510 | 2 | 5.0 | 0.5 | 0.5 | 0.00 | 0.0 | 0.3 | 6.30 |
| 572710 | 1 | 29/01/2015 19:55 | 29/01/2015 19:59 | 1 | 0.90 | -73.997810 | 40.756504 | 1 | -73.987839 | 40.764717 | 1 | 5.0 | 1.0 | 0.5 | 1.35 | 0.0 | 0.3 | 8.15 |
| 572711 | 1 | 29/01/2015 19:55 | 29/01/2015 20:18 | 1 | 6.50 | -73.952454 | 40.777096 | 1 | -74.004990 | 40.731491 | 1 | 23.0 | 1.0 | 0.5 | 2.00 | 0.0 | 0.3 | 26.80 |
572712 rows × 18 columns
InĀ [93]:
# Build the profiling report object for the whole DataFrame, with a title and the
# explorative flag passed through to ProfileReport.
Profile = ProfileReport(
    data,
    explorative=True,
    title='Pandas Profiling Report',
)
InĀ [33]:
# Show the report as this cell's output (bare expression). The trailing commented-out call,
# `.to_notebook_iframe()`, is an alternative way to render it that is currently disabled.
Profile#.to_notebook_iframe()
Summarize dataset: 0%| | 0/5 [00:00<?, ?it/s]
Generate report structure: 0%| | 0/1 [00:00<?, ?it/s]
Render HTML: 0%| | 0/1 [00:00<?, ?it/s]